#-------------------------Package Installer--------------------------
# load packages and install if missing
# thanks to Richard Schwinn for the original code, http://stackoverflow.com/a/33876492
# list the packages you need
# Packages required by this lab, loaded/installed by loadpacks() below
p <- c(
  "data.table", "ggplot2", "sf", "psych", "reshape2",
  "tidyverse", "dplyr", "corrgram", "spdep", "mapview",
  "tmap", "ggpubr", "spatialreg"
)
# this is a package loading function
# Attach every package in `package.list`, installing the missing ones first.
#
# Args:
#   package.list: character vector of package names (defaults to `p`).
# Returns (invisibly): a list of logicals, one per package, as produced by
#   require() — TRUE when the package was attached successfully.
loadpacks <- function(package.list = p) {
  # Packages not yet present in any local library
  new.packages <- package.list[!(package.list %in% installed.packages()[, "Package"])]
  if (length(new.packages) > 0) {
    # NOTE: the original forced type = "binary", which errors on Linux where
    # binary CRAN packages are unavailable; let install.packages() use the
    # platform default instead.
    install.packages(new.packages,
                     Ncpus = parallel::detectCores(),
                     repos = "https://cran.rstudio.com")
  }
  # character.only = TRUE is required because names are held in a character
  # vector; the original wrapped package.list in eval(), which was a no-op.
  invisible(lapply(package.list, require, character.only = TRUE))
}
loadpacks(p) # load (installing if missing) every package listed in `p`
rm(loadpacks, p) # drop the helper and the package list from the global environment
#----------------------End of Package Installer----------------------
#------------------------------Options-------------------------------
# Let data.table use all available cores for its internal parallelism
data.table::setDTthreads(threads = parallel::detectCores())
# Suppress scientific notation when printing numbers (incomes stay readable)
options(scipen = 999)
#---------------------------End of Options---------------------------
# Placeholder entry point left over from a script template: it is defined
# empty and immediately removed, so it has no effect on the analysis.
main <- function() {
}
rm(main)
You will be using a number of datasets in this lab.
This dataset is a summary of income estimates (in British pounds) for 2002-2013 by London boroughs. The variables are as follows:
This dataset is an extract from the full qualifications dataset - a few variables are selected from the full dataset and the year is 2011. The variables are as follows:
income_est file from data subfolder into a data.table or tibble object with a name of your choice. You might need to tell the function you are using to read the csv file that you have column names in the first row.
# Read the borough income estimates; header = TRUE keeps row 1 as column names
income <- fread("data/income_est.csv", header=TRUE)
qualifications file from data subfolder into a data.table or tibble object with a name of your choice. You might need to tell the function you are using to read the csv file that you have column names in the first row. For fread() it would be a header = TRUE parameter.
# Read the 2011 qualifications extract; header = TRUE keeps row 1 as column names
qual <- fread("data/qualifications.csv", header=TRUE)
income_est filestr(income)
## Classes 'data.table' and 'data.frame': 33 obs. of 14 variables:
## $ Code : chr "E09000001" "E09000002" "E09000003" "E09000004" ...
## $ Borough: chr "City of London" "Barking and Dagenham" "Barnet" "Bexley" ...
## $ 2002 : int 42720 20120 29480 25490 24280 29550 31300 26450 27160 25100 ...
## $ 2003 : int 43620 20410 29780 25860 24400 30040 31710 26780 27350 25240 ...
## $ 2004 : int 45150 21020 30520 26620 24900 30980 32580 27500 27940 25750 ...
## $ 2005 : int 48070 22300 32200 28220 26160 32880 34450 29070 29380 27050 ...
## $ 2006 : int 48380 22390 32150 28310 26010 33010 34460 29080 29240 26900 ...
## $ 2007 : int 51410 23750 33900 30010 27340 35000 36400 30720 30740 28260 ...
## $ 2008 : int 53110 24520 34790 30950 27960 36110 37410 31570 31450 28900 ...
## $ 2009 : int 54960 25360 35770 31990 28660 37310 38510 32510 32230 29610 ...
## $ 2010 : int 57520 26550 37220 33460 29730 39020 40110 33870 33430 30700 ...
## $ 2011 : int 59240 27360 38120 34440 30370 40150 41120 34730 34140 31340 ...
## $ 2012 : int 62290 28780 39880 36210 31690 42180 43030 36370 35600 32680 ...
## $ 2013 : int 63620 29420 40530 36990 32140 43060 43750 37000 36070 33110 ...
## - attr(*, ".internal.selfref")=<externalptr>
income_est fileheadTail(income, top=3, bottom=3)
## Code Borough X2002 X2003 X2004 X2005 X2006 X2007 X2008
## 1 E09000001 City of London 42720 43620 45150 48070 48380 51410 53110
## 2 E09000002 Barking and Dagenham 20120 20410 21020 22300 22390 23750 24520
## 3 E09000003 Barnet 29480 29780 30520 32200 32150 33900 34790
## 4 <NA> <NA> ... ... ... ... ... ... ...
## 5 E09000031 Waltham Forest 23000 23320 23990 25420 25490 27000 27820
## 6 E09000032 Wandsworth 32800 33340 34370 36450 36580 38760 39960
## 7 E09000033 Westminster 33190 33740 34770 36880 36980 39150 40320
## X2009 X2010 X2011 X2012 X2013
## 1 54960 57520 59240 62290 63620
## 2 25360 26550 27360 28780 29420
## 3 35770 37220 38120 39880 40530
## 4 ... ... ... ... ...
## 5 28730 30020 30870 32420 33080
## 6 41260 43110 44330 46550 47480
## 7 41580 43380 44530 46670 47510
qualifications filestr(qual)
## Classes 'data.table' and 'data.frame': 32 obs. of 4 variables:
## $ Code : chr "E09000002" "E09000003" "E09000004" "E09000005" ...
## $ Borough : chr "Barking and Dagenham" "Barnet" "Bexley" "Brent" ...
## $ share_qualified : num 26.9 50.4 27.2 30.2 43.5 55.2 36.7 47.1 36.8 43.2 ...
## $ share_unqualified: num 13.9 6.1 8.6 11.2 7.5 8.5 8.9 11.1 8.7 10.7 ...
## - attr(*, ".internal.selfref")=<externalptr>
# no record of city of London
qual fileheadTail(qual, top=3, bottom=3)
## Code Borough share_qualified share_unqualified
## 1 E09000002 Barking and Dagenham 26.9 13.9
## 2 E09000003 Barnet 50.4 6.1
## 3 E09000004 Bexley 27.2 8.6
## 4 <NA> <NA> ... ...
## 5 E09000031 Waltham Forest 38.4 11.3
## 6 E09000032 Wandsworth 64.5 5.6
## 7 E09000033 Westminster 60.1 6.8
Transform the income_est file so that it has the following columns:
You may name the variables as you like.
Save the result to an object, give it any name you like.
# Reshape income from wide (one column per year) to long format:
# one row per (Code, Borough, Year), income value in `income_city`
income_molten <- melt(income, id.vars=c('Code', 'Borough'),
variable.name='Year', value.name = 'income_city')
# London-wide median income for each year, attached to every row further below
income_by_year <- income_molten %>% group_by(Year) %>% summarise( MedianIncomeYear = median(income_city) )
https://stackoverflow.com/questions/54366592/r-how-to-take-median-of-rows-in-dataframe
# Attach the per-year London median income to every borough-year row
income_new <- merge(income_molten, income_by_year,
by.x='Year', by.y='Year')
# Rename by column NAME rather than by position: the original
# names(income_new)[5] <- ... silently renames the wrong column if the
# output column order of merge() ever changes.
income_new <- dplyr::rename(income_new,
                            Income_est = income_city,
                            Income = MedianIncomeYear)
head(income_new)
## Year Code Borough Income_est Income
## 1 2002 E09000001 City of London 42720 26820
## 2 2002 E09000002 Barking and Dagenham 20120 26820
## 3 2002 E09000003 Barnet 29480 26820
## 4 2002 E09000004 Bexley 25490 26820
## 5 2002 E09000005 Brent 24280 26820
## 6 2002 E09000006 Bromley 29550 26820
Using the dataset you created in the previous step, print top 5 boroughs by max income.
# Median income per borough across all years, ranked from richest downwards
income_new_copy <- income_new %>%
  group_by(Borough) %>%
  summarise(MedianIncomeBorough = median(Income_est))
sorted <- income_new_copy %>% arrange(desc(MedianIncomeBorough))
# Top five boroughs by median income
sorted[1:5, ]
## # A tibble: 5 x 2
## Borough MedianIncomeBorough
## <chr> <dbl>
## 1 City of London 52260
## 2 Kensington and Chelsea 45480
## 3 Richmond upon Thames 43950
## 4 Westminster 39735
## 5 Wandsworth 39360
Using the dataset you created in the previous step, print only 6th to 10th top boroughs by income.
# Boroughs ranked 6th through 10th by median income
sorted[6:10,]
## # A tibble: 5 x 2
## Borough MedianIncomeBorough
## <chr> <dbl>
## 1 Kingston upon Thames 36965
## 2 Camden 36905
## 3 Hammersmith and Fulham 36520
## 4 Bromley 35555
## 5 Merton 34975
Create a bar plot comparing the top 10 boroughs by income.
Hint 1: use stat = 'identity' parameter for bar plot geom. Hint 2: use + coord_flip() with your plot for better readability.
https://stackoverflow.com/questions/59008974/why-is-stat-identity-necessary-in-geom-bar-in-ggplot
# Horizontal bar chart of the ten boroughs with the highest median income.
# geom_col() is the documented idiom for geom_bar(stat = "identity"):
# bar heights are taken directly from the data values.
# coord_flip() makes the long borough names readable.
sorted[1:10,] %>% ggplot(aes(Borough, MedianIncomeBorough)) +
  geom_col() +
  coord_flip()
Select only one year worth of data from income_est (choose the year that corresponds to the year of the qualifications data) and save it as a new data.table or tibble. You should keep the Code, the Borough name and the income estimate column.
The year 2011 is chosen.
# Keep only 2011 rows to match the year of the qualifications data.
# Note: Year is a factor created by melt(); comparing it against the number
# 2011 works because the factor levels are the year strings ("2002".."2013").
income_2011 <- filter(income_new, Year == 2011)
head(income_2011)
## Year Code Borough Income_est Income
## 1 2011 E09000001 City of London 59240 34730
## 2 2011 E09000002 Barking and Dagenham 27360 34730
## 3 2011 E09000003 Barnet 38120 34730
## 4 2011 E09000004 Bexley 34440 34730
## 5 2011 E09000005 Brent 30370 34730
## 6 2011 E09000006 Bromley 40150 34730
Merge your new income estimate table with the qualifications table and save the result as a new table (you should get a 32x5 table with Code, Borough, share qualified, share unqualified and estimated income columns, all variables should be for the same year or within 1 year):
# Left-join qualifications onto the 2011 income table by borough code;
# all.x = TRUE keeps City of London, which has no qualifications record
# (its shares come through as NA).
data <- merge(income_2011, qual, by.x='Code', by.y='Code', all.x=TRUE)
# merge() suffixes the duplicated borough column as Borough.x / Borough.y.
# Rename by NAME, not by position (names(data)[3] <- ... is fragile).
data <- data %>%
  rename(Borough = Borough.x) %>%
  select(!c(Borough.y, Year, Income))
head(data)
## Code Borough Income_est share_qualified share_unqualified
## 1 E09000001 City of London 59240 NA NA
## 2 E09000002 Barking and Dagenham 27360 26.9 13.9
## 3 E09000003 Barnet 38120 50.4 6.1
## 4 E09000004 Bexley 34440 27.2 8.6
## 5 E09000005 Brent 30370 30.2 11.2
## 6 E09000006 Bromley 40150 43.5 7.5
Create a scatter plot for each pair of variables in the new table Or you can create a corrgram for all variables at once if you recall how to do it from lab 6.
# Custom lower-panel function for corrgram(): shades each cell by the
# pairwise Pearson correlation and overlays the coefficient with
# significance stars ("*" p<0.05, "**" p<0.005, "***" p<0.0005).
#
# Args (corrgram panel interface):
#   x, y        : the two variables of the current panel
#   corr        : precomputed correlation (ignored; recomputed below)
#   col.regions : palette function mapping correlation bins to fill colours
panel.shadeNtext <- function (x, y, corr = NULL, col.regions, ...)
{
corr <- cor(x, y, use = "pair")
results <- cor.test(x, y, alternative = "two.sided")
# NOTE(review): despite the name, `est` holds the test's p-value,
# not the estimate — the star thresholds below rely on that
est <- results$p.value
stars <- ifelse(est < 5e-4, "***",
ifelse(est < 5e-3, "**",
ifelse(est < 5e-2, "*", "")))
# Map the correlation onto one of 14 equal-width bins over [-1, 1]
ncol <- 14
pal <- col.regions(ncol)
col.ind <- as.numeric(cut(corr, breaks = seq(from = -1, to = 1,
length = ncol + 1), include.lowest = TRUE))
usr <- par("usr")
# Fill the whole panel with the bin colour, then outline it
rect(usr[1], usr[3], usr[2], usr[4], col = pal[col.ind],
border = NA)
box(col = "lightgray")
# Switch to a [0,1] x [0,1] coordinate system for label placement;
# the previous user coordinates are restored when the panel returns
on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r <- formatC(corr, digits = 2, format = "f")
# Scale the label so the template string fills a fixed share of the panel
cex.cor <- .8/strwidth("-X.xx")
fonts <- ifelse(stars != "", 2,1)  # bold face for option 2 below (unused under option 1)
# option 1: stars:
text(0.5, 0.4, paste0(r,"\n", stars), cex = cex.cor)
# option 2: bolding:
#text(0.5, 0.5, r, cex = cex.cor, font=fonts)
}
# Correlogram: scatter plots above the diagonal, shaded correlations with
# stars below, densities on the diagonal. Non-numeric columns (Code,
# Borough) are presumably skipped by corrgram() — verify in the output.
corrgram(data, upper.panel = panel.pts, lower.panel = panel.shadeNtext, diag.panel = panel.density)
Expectedly, a higher proportion of skilled workers is significantly positively associated with neighborhood income, while a higher proportion of unskilled workers is associated with lower income.
Test all variables for normality. Do you need to log-transform any variables? You may choose to answer that using plots or empirically (by trying to build models).
# Visual and formal normality check for the income estimate
hist(data$Income_est, breaks=33)
shapiro.test(data$Income_est)
##
## Shapiro-Wilk normality test
##
## data: data$Income_est
## W = 0.89103, p-value = 0.003143
The income variable has to be transformed.
hist(data$share_qualified, breaks=33)
shapiro.test(data$share_qualified)
##
## Shapiro-Wilk normality test
##
## data: data$share_qualified
## W = 0.97594, p-value = 0.6759
hist(data$share_unqualified, breaks=33)
shapiro.test(data$share_unqualified)
##
## Shapiro-Wilk normality test
##
## data: data$share_unqualified
## W = 0.97358, p-value = 0.6034
The p-value in the Shapiro test is greater than the critical value, so we can say that we have no significant differences from the normal distribution. You can also see on the correlogram plots that the distribution is close to normal.
__
Build a multiple linear regression model. Try to predict income by the shares of qualified and unqualified workforce.
# OLS model: log income (log-transformed because the Shapiro-Wilk test above
# rejected normality) explained by the qualified/unqualified workforce shares
m.1 <- lm(log(Income_est) ~ share_qualified + share_unqualified, data=data)
Print model summary:
summary(m.1)
##
## Call:
## lm(formula = log(Income_est) ~ share_qualified + share_unqualified,
## data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.16413 -0.04821 -0.01657 0.04850 0.21646
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.540363 0.110571 95.327 < 0.0000000000000002 ***
## share_qualified 0.005698 0.001570 3.628 0.00109 **
## share_unqualified -0.033263 0.006566 -5.066 0.0000211 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08888 on 29 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.6779, Adjusted R-squared: 0.6557
## F-statistic: 30.52 on 2 and 29 DF, p-value: 0.00000007327
Interpret the model by analysing the residuals and inspecting the key model summary coefficients.
# Diagnostic panel (residuals vs fitted, QQ-plot, etc.) for the OLS model
ggResidpanel::resid_panel(m.1)
The residuals do not depend on predicted values, and we can also see from the QQ-plot that the range of values is close to the line. This tells us that the model has sufficient predictive power.
Apart from code, provide a brief description of how you can interpret the results.
It can be noticed that both coefficients are significant. Like in the correlations, the increase in the share of qualified workers increases district income on average, ceteris paribus. In contrast, the increase in the share of unqualified labor workers decreases district income on average, ceteris paribus.
Do you think the model may suffer from spatial autocorrelation? You will be able to explore this later in the lab.
Load london_boroughs spatial data from the data subfolder to an object with any name. Make sure it has the same number of boroughs as your table from the previous part of the analysis.
st_layers('data/london_boroughs.gpkg')
## Driver: GPKG
## Available layers:
## layer_name geometry_type features fields
## 1 london_22 Multi Polygon 15 7
## 2 london_33 Multi Polygon 20 7
## 3 london_24 Multi Polygon 33 7
## 4 london_112 Multi Polygon 29 7
## 5 london_52 Multi Polygon 3 7
london <- st_read('data/london_boroughs.gpkg', layer = 'london_24')
## Reading layer `london_24' from data source
## `C:\Users\elena\Desktop\Studies and work\Urban_Programming\lab_exam\London spatial\data\london_boroughs.gpkg'
## using driver `GPKG'
## Simple feature collection with 33 features and 7 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 672920.2 ymin: 5685642 xmax: 731187.7 ymax: 5730735
## Projected CRS: WGS 84 / UTM zone 30N
london %>% st_geometry() %>% plot(lwd = 0.7)
mapview(london)
names(london)
## [1] "NAME" "GSS_CODE" "HECTARES" "NONLD_AREA" "ONS_INNER"
## [6] "SUB_2009" "SUB_2006" "geom"
str(london)
## Classes 'sf' and 'data.frame': 33 obs. of 8 variables:
## $ NAME : chr "Kingston upon Thames" "Croydon" "Bromley" "Hounslow" ...
## $ GSS_CODE : chr "E09000021" "E09000008" "E09000006" "E09000018" ...
## $ HECTARES : num 3726 8649 15013 5659 5554 ...
## $ NONLD_AREA: num 0 0 0 60.8 0 ...
## $ ONS_INNER : chr "F" "F" "F" "F" ...
## $ SUB_2009 : chr NA NA NA NA ...
## $ SUB_2006 : chr NA NA NA NA ...
## $ geom :sfc_MULTIPOLYGON of length 33; first list element: List of 1
## ..$ :List of 1
## .. ..$ : num [1:1271, 1:2] 685963 685968 685974 685981 685989 ...
## ..- attr(*, "class")= chr [1:3] "XY" "MULTIPOLYGON" "sfg"
## - attr(*, "sf_column")= chr "geom"
## - attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA NA NA NA NA NA NA
## ..- attr(*, "names")= chr [1:7] "NAME" "GSS_CODE" "HECTARES" "NONLD_AREA" ...
Merge/join your table (the one that you used for modelling above) to the spatial object. Show the structure of the resulting spatial object.
# Join the modelling table onto the borough polygons by ONS code;
# all.x = TRUE keeps all 33 boroughs even where attribute data is missing
spatial_dat <- merge(london, data, by.x = "GSS_CODE", by.y = 'Code', all.x=TRUE)
str(spatial_dat)
## Classes 'sf' and 'data.frame': 33 obs. of 12 variables:
## $ GSS_CODE : chr "E09000001" "E09000002" "E09000003" "E09000004" ...
## $ NAME : chr "City of London" "Barking and Dagenham" "Barnet" "Bexley" ...
## $ HECTARES : num 315 3780 8675 6429 4323 ...
## $ NONLD_AREA : num 24.5 169.2 0 370.6 0 ...
## $ ONS_INNER : chr "T" "F" "F" "F" ...
## $ SUB_2009 : chr NA NA NA NA ...
## $ SUB_2006 : chr NA NA NA NA ...
## $ Borough : chr "City of London" "Barking and Dagenham" "Barnet" "Bexley" ...
## $ Income_est : int 59240 27360 38120 34440 30370 40150 41120 34730 34140 31340 ...
## $ share_qualified : num NA 26.9 50.4 27.2 30.2 43.5 55.2 36.7 47.1 36.8 ...
## $ share_unqualified: num NA 13.9 6.1 8.6 11.2 7.5 8.5 8.9 11.1 8.7 ...
## $ geometry :sfc_MULTIPOLYGON of length 33; first list element: List of 1
## ..$ :List of 1
## .. ..$ : num [1:343, 1:2] 700427 700426 700425 700425 700426 ...
## ..- attr(*, "class")= chr [1:3] "XY" "MULTIPOLYGON" "sfg"
## - attr(*, "sf_column")= chr "geometry"
## - attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA NA NA NA NA NA NA NA NA NA ...
## ..- attr(*, "names")= chr [1:11] "GSS_CODE" "NAME" "HECTARES" "NONLD_AREA" ...
Create a map showing income. Use ggplot, tmap or mapview, or any package you want.
# Interactive choropleth of the 2011 income estimate by borough
mapview(spatial_dat, zcol='Income_est')
We can see that lower income districts are located in the North-Eastern part of the city, while higher income districts are in the city centre and South-East. Therefore we may be dealing with spatial autocorrelation.
Now you have an option to try and improve the regression model that you have built previously. Does the model suffer from spatial autocorrelation? You may add additional code blocks to structure your code and make it cleaner.
Let us find the neighbours for each district.
# Contiguity neighbour list for the borough polygons (default queen criterion)
neighbours <- spdep::poly2nb(spatial_dat)
neighbours
## Neighbour list object:
## Number of regions: 33
## Number of nonzero links: 136
## Percentage nonzero weights: 12.48852
## Average number of links: 4.121212
Let us construct the matrix of neighbours by using the initial sf-object.
Queen neighborhood type
# Queen contiguity: polygons sharing any boundary point count as neighbours
lnd_nb_queen <- poly2nb(spatial_dat, queen = TRUE)
plot.nb(lnd_nb_queen, st_geometry(spatial_dat), lwd = 0.3)
Rook neighborhood type
# Rook contiguity: polygons must share a boundary segment, not just a point
lnd_nb_rook <- poly2nb(spatial_dat, queen = FALSE)
plot.nb(lnd_nb_rook, st_geometry(spatial_dat), lwd = 0.3)
We may estimate the number of neighbours for each district via two ways and compare in which case the variation will be lower.
# Number of neighbours per borough under each contiguity definition.
# lengths() is the vectorised base-R replacement for sapply(x, length):
# it always returns an integer vector, whereas sapply()'s return type
# depends on its input.
n_nbrs_queen <- lengths(lnd_nb_queen)
n_nbrs_rook <- lengths(lnd_nb_rook)
compare_nbrs <- data.table(queen = n_nbrs_queen, rook = n_nbrs_rook)
# Long format so ggplot can facet the histograms by neighbourhood type
compare_nbrs_long <- melt(compare_nbrs, measure.vars = c("queen", "rook"), value.name = "n_neighbours", variable.name = "type")
compare_nbrs_long %>%
  ggplot() +
  geom_histogram(aes(x = n_neighbours, col = type), fill = NA, binwidth = 1, position = "identity") +
  facet_wrap(~type) +
  theme_pubclean()
The distribution of neighbours is identical regardless of neighborhood type.
# Row-standardised ("W") spatial weights from the queen neighbour list
weights <- nb2listw(lnd_nb_queen, style="W")
# Spatially lagged income: the weighted neighbour-average income per borough
spatial_dat$lag_Income_est <- lag.listw(x = weights, spatial_dat$Income_est)
# Global Moran's I with a 999-permutation Monte-Carlo significance test
moran.mc(spatial_dat$Income_est, listw = weights, nsim = 999)
##
## Monte-Carlo simulation of Moran I
##
## data: spatial_dat$Income_est
## weights: weights
## number of simulations + 1: 1000
##
## statistic = 0.24636, observed rank = 981, p-value = 0.019
## alternative hypothesis: greater
According to the p-value (which is lower than the significance level), there is spatial autocorrelation of income between the districts.
# Local Moran's I (LISA) per borough, plus the distribution of its p-values
local_moran_Income_est <- localmoran(x = spatial_dat$Income_est, listw = weights)
hist(local_moran_Income_est[,5], breaks = 20)
signif <- 0.05
spatial_dat$Income_est_sig <- local_moran_Income_est[,5] # get the 5th column of the matrix - the `Pr(z > 0)` column
spatial_dat$Income_est_li <- local_moran_Income_est[,1] - mean(local_moran_Income_est[,1])
# BUG FIX: LISA quadrants must compare deviations from the mean. The original
# tested raw Income_est > 0 and lag_Income_est > 0, which are always TRUE for
# income, so every significant borough was misclassified as "high-high".
inc_dev <- spatial_dat$Income_est - mean(spatial_dat$Income_est, na.rm = TRUE)
lag_dev <- spatial_dat$lag_Income_est - mean(spatial_dat$lag_Income_est, na.rm = TRUE)
spatial_dat <- spatial_dat %>% mutate(quad_sig_Income_est = case_when(
  Income_est_sig > signif ~ "non-significant",
  inc_dev > 0 & lag_dev > 0 & Income_est_sig <= signif ~ "high-high",
  inc_dev < 0 & lag_dev < 0 & Income_est_sig <= signif ~ "low-low",
  inc_dev > 0 & lag_dev < 0 & Income_est_sig <= signif ~ "high-low",
  inc_dev < 0 & lag_dev > 0 & Income_est_sig <= signif ~ "low-high"))
table(spatial_dat$quad_sig_Income_est)
##
## high-high non-significant
## 4 29
head(spatial_dat)
## Simple feature collection with 6 features and 15 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 684700.2 ymin: 5686446 xmax: 723412.8 ymax: 5728067
## Projected CRS: WGS 84 / UTM zone 30N
## GSS_CODE NAME HECTARES NONLD_AREA ONS_INNER SUB_2009
## 1 E09000001 City of London 314.942 24.546 T <NA>
## 2 E09000002 Barking and Dagenham 3779.934 169.150 F <NA>
## 3 E09000003 Barnet 8674.837 0.000 F <NA>
## 4 E09000004 Bexley 6428.649 370.619 F <NA>
## 5 E09000005 Brent 4323.270 0.000 F <NA>
## 6 E09000006 Bromley 15013.487 0.000 F <NA>
## SUB_2006 Borough Income_est share_qualified share_unqualified
## 1 <NA> City of London 59240 NA NA
## 2 <NA> Barking and Dagenham 27360 26.9 13.9
## 3 <NA> Barnet 38120 50.4 6.1
## 4 <NA> Bexley 34440 27.2 8.6
## 5 <NA> Brent 30370 30.2 11.2
## 6 <NA> Bromley 40150 43.5 7.5
## geometry lag_Income_est Income_est_sig Income_est_li
## 1 MULTIPOLYGON (((700427.3 57... 37496.00 0.6265667 -0.03006582
## 2 MULTIPOLYGON (((713156.7 57... 31733.33 0.1317454 0.86997600
## 3 MULTIPOLYGON (((693620.4 57... 34510.00 0.3842329 -0.30541421
## 4 MULTIPOLYGON (((716504.1 57... 36405.00 0.8798883 -0.21008148
## 5 MULTIPOLYGON (((694458.6 57... 41035.71 0.1009573 -0.82403334
## 6 MULTIPOLYGON (((709974.9 56... 34290.00 0.3006389 -0.43191213
## quad_sig_Income_est
## 1 non-significant
## 2 non-significant
## 3 non-significant
## 4 non-significant
## 5 non-significant
## 6 non-significant
We can see that the significance is added to the last column of the dataset.
# Map the LISA classification: significant clusters vs non-significant boroughs
map_1 <- spatial_dat %>% dplyr::select(Income_est, Income_est_sig, quad_sig_Income_est)
mapview(map_1, zcol = "quad_sig_Income_est")
The only significantly different districts are the central district of London and some of the South-Eastern districts.
# Spatial LAG model (timed): neighbours' income enters the model via rho
system.time( man_lagsarlm <- lagsarlm(log(Income_est) ~ share_qualified + share_unqualified, data = spatial_dat, listw = weights) )
## user system elapsed
## 0.35 0.00 0.38
summary(man_lagsarlm)
##
## Call:lagsarlm(formula = log(Income_est) ~ share_qualified + share_unqualified,
## data = spatial_dat, listw = weights)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.163949 -0.048236 -0.016882 0.048428 0.216464
##
## Type: lag
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 10.5103469 1.8519817 5.6752 0.00000001385
## share_qualified 0.0056926 0.0015438 3.6875 0.0002265
## share_unqualified -0.0331996 0.0072318 -4.5908 0.00000441638
##
## Rho: 0.0028315, LR test value: 0.00023381, p-value: 0.9878
## Asymptotic standard error: 0.17474
## z-value: 0.016204, p-value: 0.98707
## Wald statistic: 0.00026257, p-value: 0.98707
##
## Log likelihood: 33.62559 for lag model
## ML residual variance (sigma squared): 0.0071583, (sigma: 0.084607)
## Number of observations: 32
## Number of parameters estimated: 5
## AIC: -57.251, (AIC for lm: -59.251)
## LM test for residual autocorrelation
## test value: 1.8234, p-value: 0.17691
# Spatial ERROR model (timed): spatial dependence enters through the
# error term via lambda
system.time( man_errorsarlm <- errorsarlm(log(Income_est) ~ share_qualified + share_unqualified, data = spatial_dat, listw = weights) )
## user system elapsed
## 0.33 0.00 0.34
summary(man_errorsarlm)
##
## Call:
## errorsarlm(formula = log(Income_est) ~ share_qualified + share_unqualified,
## data = spatial_dat, listw = weights)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.158568 -0.049496 -0.015760 0.059591 0.199064
##
## Type: error
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 10.5377073 0.1098399 95.9369 < 0.00000000000000022
## share_qualified 0.0058559 0.0015474 3.7844 0.0001541
## share_unqualified -0.0334749 0.0068105 -4.9152 0.0000008868
##
## Lambda: 0.24087, LR test value: 0.96926, p-value: 0.32486
## Asymptotic standard error: 0.21551
## z-value: 1.1177, p-value: 0.26371
## Wald statistic: 1.2492, p-value: 0.26371
##
## Log likelihood: 34.1101 for error model
## ML residual variance (sigma squared): 0.0068368, (sigma: 0.082685)
## Number of observations: 32
## Number of parameters estimated: 5
## AIC: -58.22, (AIC for lm: -59.251)
The error spatial autocorrelation coefficient (lambda) is 0.24087.
Write a simple function with two parameters.
This function must take as an sf object as a first parameter. The second parameter is a character vector of length one with the name of the variable that you want to create a histogram for. The function log-transforms the variable specified via the 2nd parameter and creates a histogram with title “Log-transformed variable histogram”.
If you cannot write such function, write a function that finds a 5th root of the product of squares of 3 numbers - all passed as arguments into the function, and the result should be divided by some number passed as the fourth argument to that function. The second and third arguments default value is 1. The first argument has no default value. The output of the function is a printout like You have entered numbers 3, 7 and 8. The result rounded to 2 digits is: 7.76. In any case, try to build the function in its basic form. You may not, for example, be able to create a proper printout for the function, but the result of the calculation may be correct.
Define your function below:
# Histogram of the log-transformed column `var` of the sf object `sf_obj`.
#
# Args:
#   sf_obj: sf object holding the data
#   var   : character scalar naming the column to log-transform
# Returns: the histogram object produced by hist() (also drawn as a plot).
fun_1 <- function(sf_obj,
                  var) {
  # Drop the geometry column first: only the attribute table is needed
  sf_obj <- st_set_geometry(sf_obj, NULL)
  var_vector <- log(sf_obj[, var])
  plotted <- hist(var_vector,
                  main = "Log-transformed variable histogram",
                  breaks = 33)
  return(plotted)
}
Test your function three times with different values for parameters in the code chunk below:
fun_1(spatial_dat, 'Income_est')
## $breaks
## [1] 10.18 10.20 10.22 10.24 10.26 10.28 10.30 10.32 10.34 10.36 10.38 10.40
## [13] 10.42 10.44 10.46 10.48 10.50 10.52 10.54 10.56 10.58 10.60 10.62 10.64
## [25] 10.66 10.68 10.70 10.72 10.74 10.76 10.78 10.80 10.82 10.84 10.86 10.88
## [37] 10.90 10.92 10.94 10.96 10.98 11.00
##
## $counts
## [1] 1 1 0 0 0 0 0 2 1 0 3 3 1 6 0 1 1 2 1 1 0 1 3 0 0 1 1 0 0 0 0 1 0 1 0 0 0 0
## [39] 0 0 1
##
## $density
## [1] 1.515152 1.515152 0.000000 0.000000 0.000000 0.000000 0.000000 3.030303
## [9] 1.515152 0.000000 4.545455 4.545455 1.515152 9.090909 0.000000 1.515152
## [17] 1.515152 3.030303 1.515152 1.515152 0.000000 1.515152 4.545455 0.000000
## [25] 0.000000 1.515152 1.515152 0.000000 0.000000 0.000000 0.000000 1.515152
## [33] 0.000000 1.515152 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [41] 1.515152
##
## $mids
## [1] 10.19 10.21 10.23 10.25 10.27 10.29 10.31 10.33 10.35 10.37 10.39 10.41
## [13] 10.43 10.45 10.47 10.49 10.51 10.53 10.55 10.57 10.59 10.61 10.63 10.65
## [25] 10.67 10.69 10.71 10.73 10.75 10.77 10.79 10.81 10.83 10.85 10.87 10.89
## [37] 10.91 10.93 10.95 10.97 10.99
##
## $xname
## [1] "var_vector"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
fun_1(spatial_dat, 'share_qualified')
## $breaks
## [1] 2.90 2.95 3.00 3.05 3.10 3.15 3.20 3.25 3.30 3.35 3.40 3.45 3.50 3.55 3.60
## [16] 3.65 3.70 3.75 3.80 3.85 3.90 3.95 4.00 4.05 4.10 4.15 4.20
##
## $counts
## [1] 1 0 0 0 0 0 0 1 1 0 1 0 0 2 4 1 0 3 2 5 2 1 3 4 0 1
##
## $density
## [1] 0.625 0.000 0.000 0.000 0.000 0.000 0.000 0.625 0.625 0.000 0.625 0.000
## [13] 0.000 1.250 2.500 0.625 0.000 1.875 1.250 3.125 1.250 0.625 1.875 2.500
## [25] 0.000 0.625
##
## $mids
## [1] 2.925 2.975 3.025 3.075 3.125 3.175 3.225 3.275 3.325 3.375 3.425 3.475
## [13] 3.525 3.575 3.625 3.675 3.725 3.775 3.825 3.875 3.925 3.975 4.025 4.075
## [25] 4.125 4.175
##
## $xname
## [1] "var_vector"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
fun_1(spatial_dat, 'share_unqualified')
## $breaks
## [1] 1.40 1.45 1.50 1.55 1.60 1.65 1.70 1.75 1.80 1.85 1.90 1.95 2.00 2.05 2.10
## [16] 2.15 2.20 2.25 2.30 2.35 2.40 2.45 2.50 2.55 2.60 2.65
##
## $counts
## [1] 1 0 0 0 2 0 1 0 1 0 1 1 4 1 1 3 0 1 3 2 5 1 1 0 3
##
## $density
## [1] 0.625 0.000 0.000 0.000 1.250 0.000 0.625 0.000 0.625 0.000 0.625 0.625
## [13] 2.500 0.625 0.625 1.875 0.000 0.625 1.875 1.250 3.125 0.625 0.625 0.000
## [25] 1.875
##
## $mids
## [1] 1.425 1.475 1.525 1.575 1.625 1.675 1.725 1.775 1.825 1.875 1.925 1.975
## [13] 2.025 2.075 2.125 2.175 2.225 2.275 2.325 2.375 2.425 2.475 2.525 2.575
## [25] 2.625
##
## $xname
## [1] "var_vector"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
Rewrite your function in such a way that it checks if the name of the variable to log-transform and creates a histogram that you are passing to it actually exists in the dataset. If it does not exist, return and error message with print() instead of trying to plot the variable that does not exist.
# Histogram of the log-transformed column `var` of `sf_obj`, with a guard
# against non-existent column names.
#
# Args:
#   sf_obj: sf object (or data.frame) holding the data
#   var   : character scalar naming the column to log-transform
# Returns: the histogram object from hist(), or (as the exercise requires,
#   via print() rather than stop()) the error string when `var` is not a
#   column of `sf_obj`.
fun_1 <- function(sf_obj,
                  var) {
  # Guard clause: report and bail out early when the column is absent
  if (!var %in% colnames(sf_obj)) {
    return(print('Error! This variable does not exist'))
  }
  sf_obj <- sf_obj %>% st_set_geometry(NULL)
  # [[ ]] extraction always yields a plain vector, whereas [, var]'s return
  # type depends on the container (data.frame vs tibble vs data.table)
  var_vector <- log(sf_obj[[var]])
  hist_toplot <- hist(var_vector, main="Log-transformed variable histogram", breaks=33)
  return(hist_toplot)
}
# Sanity check: 'ty' is not a column of spatial_dat, so the error branch fires
fun_1(spatial_dat, 'ty')
## [1] "Error! This variable does not exist"